/*
Cleans union basic info from CNES; gender of union representatives; contracts from CNES (at contract, and pair-year level)
		// input: Sindicatos_Regiao_`region'; Federacoes; Confederacoes.xlsx; 
		//        AfericaoCentral_2012-2016; _reps; repids_gender;
		//        contracts_cnes; estabchars_wide
		// output: union_list_clean; CBA_reps; contracts_act; 
		//         estab_union_act; filledpanel

Cleaning steps:

1. Assign treatment status to each union --> union-level dataset
	* Input: "$raw/CNES/Sindicatos_Regiao_`region'.xlsx"; "$raw/CNES/Federacoes.xlsx"; "$raw/CNES/Confederacoes.xlsx"; "$raw/CNES/AfericaoCentral_2012-2016.xlsx"
	* Output: "$files/union_list_clean"
	
2. Determine gender of each union representative --> contract-representative level dataset
	* Input: "$raw/_reps.txt"
	* Output: "$files/CBA_reps.dta"

3. Clean contract data (only valid, non amendment ACTs) --> contract level dataset
	3a: attach treatment status --> drop if treatment status not unique at contract level (0.33% of contracts)
	3b: attach data on gender of rep
	3c: attach to contract level data the unions id in wide form
	3d: clean clauses
		* Input: "$files/unions/contracts_cnes.dta"
		* Output: "$files/contracts_act.dta"
	
4. Data at pair-year level (pair = signing establishment - union)
		* Input:  "$files/unions/contracts_cnes.dta",  "$files/contracts_act.dta",  "$files/union_list_clean", "$files/estabchars_wide.dta"
		* Output: "$files/estab_union_act.dta" - THIS IS OUR "NEW CONTRACT DATA"

5. Fill in panel at establishment-union-year level
		* Input: "$files/unions/contracts_cnes.dta", "$files/contracts_act.dta", "$files/estab_union_act.dta"
		* Output: "$files/filledpanel.dta" - THIS IS OUR "FILLED PANEL DATA"

*/



cap log close
cap log using "$logs/cleaning_cnes", replace 

****************
*** Step 1: UNIONS
****************

	* Import union lists
		local i = 1 
		local regions Centro_Oeste Nordeste Norte Sudeste Sul 
		foreach region of local regions {		
			import excel using "$raw/CNES/Sindicatos_Regiao_`region'.xlsx", firstrow clear
			keep CNPJ Grau Denominação Grupo Abrangência BaseTerritorial Categoria Número NomedoPresidente Região
			rename CNPJ union_id
			rename Grau level
			rename Denominação union_name_full
			rename Grupo union_type
			rename Abrangência geography_level
			rename BaseTerritorial geography
			rename Categoria worker_occupation
			rename Número number // likely number of members as of 2017
			rename NomedoPresidente president_name // likely incumbent in 2017
			rename Região region
			tempfile union`i'
			save `union`i''
			local ++i
		}
	
	* Import federation lists
		import excel using "$raw/CNES/Federacoes.xlsx", firstrow clear
		keep CNPJ Grau Denominação Grupo Abrangência BaseTerritorial Categoria Número NomedoPresidente Região
		rename CNPJ union_id
		rename Grau level
		rename Denominação union_name_full
		rename Grupo worker_type
		rename Abrangência geography_level
		rename BaseTerritorial geography
		rename Categoria worker_occupation
		rename Número number // likely number of members as of 2017
		rename NomedoPresidente president_name // likely incumbent in 2017
		rename Região region
		tempfile union6
		save `union6'

	* Import confederation lists
		import excel using "$raw/CNES/Confederacoes.xlsx", firstrow clear
		keep CNPJ Grau Denominação Grupo Abrangência Categoria Número NomedoPresidente 
		rename CNPJ union_id
		rename Grau level
		rename Denominação union_name_full
		rename Grupo worker_type
		rename Abrangência geography_level
		rename Categoria worker_occupation
		rename Número number // likely number of members as of 2017
		rename NomedoPresidente president_name // likely incumbent in 2017
		tempfile union7
		save `union7'
		
	* Append and all union files
		use `union1'
		forvalues i = 2/7 {
			append using `union`i''
		}
		encode region, gen(region_num)
		label def regions 1 "CW" 2 "NE" 3 "N" 4 "SE" 5 "S"
		label values region_num regions
		drop region
		rename region_num region
		drop if union_id == ""
		save "$files/union_list_clean.dta", replace
		
	* Add union central affiliation
		forvalues y = 2012/2016 {
			import excel using "$raw/CNES/AfericaoCentral_2012-2016.xlsx", sheet("cs`y'") firstrow clear
			drop cs_abb
			rename cs_code cs
			gen year = `y'
			
			
			tempfile aff`y'
			save `aff`y''
		}
	//append and save
		use `aff2012'
		append using `aff2013'
		append using `aff2014'
		append using `aff2015'
		append using `aff2016'
		reshape wide uf filiados cs, i(cnpj) j(year)
		order cnpj cs* uf* filiados*
		rename cnpj union_id
		save "$files/union_affiliation.dta", replace
	
	* Merge unions with central affiliation
		use "$files/union_list_clean.dta", clear
		merge 1:1 union_id using "$files/union_affiliation.dta"
		drop _merge
		
	// CUT affiliation of union (indicator conditional on having info on affiliation in 2012)
		gen CUT = (cs2012 == 8) if cs2012!=.
		save "$files/union_list_clean.dta", replace


	
****************
*** Step 2: UNIONS' REPS
****************

		* Names of representatives to classify
		import delimited "$raw/_reps.txt", delimiter("|") clear
		rename (v1 v2 v3 v4 v5) (contract_id regis rep_name rep_title cnpj)
		egen rep_id = group(rep_name)	
		tempfile reps
		save `reps'

		split rep_name, p(" ")

		duplicates drop rep_id, force
		keep rep_name* rep_id
		export delimited rep_name* rep_id using  "$files/repids.csv", replace

		/* Running genderBR on repids.csv generates "$files/genderBR/repids_gender.csv"
			which requires some manual modifications to reduce the number of missing
			gender information for the boards, which is why the file is already provide in
			the replication package
			
			To get the file that we later modified manually, uncomment the next three lines:
		
			local infile "$files/repids.csv"
			local outfile "$files/genderBR/repids_gender.csv"
			shell "$RPATH" "$CODEPATH/get_gender_fromnames.R" "`infile'" "`outfile'"
		*/

		* Get "gendered" representatives, match them back to their CBAs with a Pres/VP identifier
		import delimited using "$files/genderBR/repids_gender.csv", clear varnames(1)
		keep rep_name gender rep_id
		duplicates drop rep_id, force
		destring rep_id, replace force
		merge 1:m rep_id using `reps'
		gen P_or_VP = (inlist(rep_title, "Presidente", "Vice - Presidente", "Vice-Presidente")) if rep_title!=""
		drop if rep_name==""
		drop _merge 
		rename cnpj union_id 
		save "$files/CBA_reps.dta", replace

		
****************
*** Step 3: CBAs
****************
	
		use "$files/unions/contracts_cnes.dta", clear
		gen year=year(start_mdy)
		keep if year>=2009
		
		* Valid (legal), Extrato (original and not amendment), ACT (firm-level and not sectoral)
		keep if (valid==1)&(act==1)&(extrato==1)
		unique contract_id 
		
		/* 3A. Gen treatment status of contract */
			merge m:1 union_id using "$files/union_list_clean.dta", keepusing(CUT)
			drop if _m == 2
			drop _merge
			egen maxtreat = max(CUT) if CUT!=. , by(contract_id)
			egen mintreat = min(CUT) if CUT!=. , by(contract_id)	
			* Drop the 0.33% of contracts with non unique treatment status
			unique contract_id if maxtreat != mintreat
			count if maxtreat != mintreat
			drop if maxtreat != mintreat 
			rename maxtreat treat
			drop mintreat
		
		/* 3B. Attach gender of signing rep */
			* Number of signing unions + signing estab 
			rename numb_entities numb_entities_old
			gen numb_entities = numb_unions + numb_employers
			* Keep only contract-union-level variables
			keep contract_id union_id geog_type start_mdy end_mdy numb_clauses cl_* ///
			year treat numb_entities numb_employers numb_unions
			duplicates drop
			merge 1:m contract_id union_id using "$files/CBA_reps.dta", keepusing(rep_id gender)
			drop if _m == 2
			drop _merge
			* Gender of union rep at contract_id level
			gen fem1 =(gender=="Female") if gender!="NA"
			gen n = 1 if gender!="NA"
			bysort contract_id: egen num_unionrep = sum(n)
			bysort contract_id: egen num_unionrepF = sum(fem1)
			gen sh_fem_rep = num_unionrepF/num_unionrep
			label var sh_fem_rep "% of reps signing the firm's CBA that are female"
			label var num_unionrep "Number of union reps"
			label var num_unionrepF "Number of women union reps"
			drop n fem1
			* Dummy for any female rep 
			gen any_femrep = (sh_fem_rep>0) if sh_fem_rep!=. 
			lab var any_femrep "A female rep signs the firm's CBA"
		
		/* 3C. Contract level data */
			drop rep_id gender union_id 
			duplicates drop 
			unique contract_id
			* Gen post and treat*post variables
			//based on the start date of the CBA (relative to the CUT reform)
			gen reform = date("04/01/2015", "MDY")
			gen post = start_mdy > reform if start_mdy!=. 
			gen treatpost = treat * post
			* Gen type of contract by who signs it
			gen type_sign = .
			replace type_sign = 1 if numb_employers == numb_unions ==1 // one employer - one union
			replace type_sign = 2 if numb_employers >1 & numb_unions ==1 // multi employers - one union
			replace type_sign = 3 if numb_employers ==1 & numb_unions >1 // one employer - multiple unions
			replace type_sign = 4 if numb_employers >1 & numb_unions >1 // multiple employers - multiple unions
			* Save contract data
			// drop 5% of contracts because they have union central affiliation information in 2012
			mdesc treat post
			drop if treat ==.
			isid contract_id
			order contract_id year treat geog_type numb_clauses numb_employers numb_unions numb_entities, first
			save "$files/contracts_act.dta", replace
			* Attach union ids (contract x union observations)
			use "$files/unions/contracts_cnes.dta", clear
			keep contract_id union_id 
			duplicates drop
			merge m:1 contract_id using "$files/contracts_act.dta"
			keep if _merge == 3 
			drop _merge
			* reshape (unique contract observations)
			bysort contract_id: gen union_no = _n 
			reshape wide union_id, i(contract_id) j(union_no)
			save "$files/contracts_act.dta", replace

	/* 3D. Categorize clauses */
		* Binary clause variables
		cap drop cl_0* 
		foreach xxx of varlist cl* {
			gen b`xxx' = (`xxx'>0)&(`xxx'<.)
		}
		
		/* Total number of clauses */
		egen total_clauses = rowtotal(cl*)
		lab var total_clauses "Total number of clauses"
		egen total_clauses_b = rowtotal(bcl*)
		lab var total_clauses_b "Total number of clauses (binary)"
		
		* Childcare assistance, maternity assistance, abortion protections, maternity protections, paternity protections, policy for dependents
			egen maternity_clauses =rowtotal(cl_23aux_cre cl_23aux_mat cl_43est_abo cl_43est_mae cl_43est_pai cl_43pol_par)
			lab var maternity_clauses "Total maternity clauses"
			
			egen maternity_clauses_b =rowtotal(bcl_23aux_cre bcl_23aux_mat bcl_43est_abo bcl_43est_mae bcl_43est_pai bcl_43pol_par)
			lab var maternity_clauses_b "Total maternity clauses (binary)"
			
			gen any_maternity_cl = (maternity_clauses>0) & maternity_clauses!=. 
			lab var any_maternity_cl "Has any maternity/childcare clause"
			
		* Discrimination and harassment related clauses
		* Sexual harassment, equal opportunities
		
			egen harassment_clauses=rowtotal(cl_42ass_sex cl_42igu_opo)
			lab var harassment_clauses "Total harassment clauses"
			
			egen harassment_clauses_b=rowtotal(bcl_42ass_sex bcl_42igu_opo)
			lab var harassment_clauses_b "Total harassment clauses (binary)"
			
			gen any_harassment_cl = (harassment_clauses>0) & harassment_clauses!=.
			lab var any_harassment_cl "Has any harassment clause"

		* Flexibility related clauses
		* Part-time contracts, work authorization on Sundays and holidays, workday controls, 
		* weekly rest, duration and schedule, absences, break intervals, special shifts, other provisions
		* of the workday, extension/reduction of the workday, on-call, uninterrupted shifts
			
			* workday controls + special shifts (mostly about breast-feeding) + on-call + uninterrupted shifts
			egen flexibility_clauses = rowtotal(cl_51con_jor cl_51jor_esp cl_51sob cl_51tur_ini)
			* + part-time
			egen flexibility2_clauses = rowtotal(cl_51con_jor cl_51jor_esp cl_51sob cl_51tur_ini cl_32con_tem)
			
			lab var flexibility_clauses "Total flexiblity clauses"
			lab var flexibility2_clauses "Total flexiblity clauses + part-time"
			
			* workday controls + special shifts (mostly about breast-feeding) + on-call + uninterrupted shifts
			egen flexibility_clauses_b = rowtotal(bcl_51con_jor bcl_51jor_esp bcl_51sob bcl_51tur_ini)
			* + part-time
			egen flexibility2_clauses_b = rowtotal(bcl_51con_jor bcl_51jor_esp bcl_51sob bcl_51tur_ini bcl_32con_tem)
			
			lab var flexibility_clauses_b "Total flexiblity clauses (binary)"
			lab var flexibility2_clauses_b "Total flexiblity clauses + part-time (binary)"
			
			gen any_flexibility_cl = (flexibility_clauses>0) & flexibility_clauses!=.
			lab var any_flexibility_cl "Has any flexibility clause"
			
			gen any_flexibility2_cl = (flexibility2_clauses>0) & flexibility2_clauses!=.
			lab var any_flexibility2_cl "Has any flexibility clause + part-time"
			
		* Leave related clauses
		* female workforce, abortion leave, adoption leave, maternity leave, unpaid leave, paid leave, other provisions on holidays and leaves
			egen leave_clauses =rowtotal(cl_33maoobr_fem cl_72lic_abo cl_72lic_ado cl_72lic_mat cl_72lic_nao cl_72lic_rem cl_73out_dis)
			lab var leave_clauses "Total leave clauses"

			egen leave_clauses_b =rowtotal(bcl_33maoobr_fem bcl_72lic_abo bcl_72lic_ado bcl_72lic_mat bcl_72lic_nao bcl_72lic_rem bcl_73out_dis)
			lab var leave_clauses_b "Total leave clauses (binary)"
			
			gen any_leave_cl = (leave_clauses>0) & leave_clauses!=. 
			lab var any_leave_cl "Has any leave clause"
			
		* Any female-specific amenity

			gen female_clauses =  maternity_clauses + harassment_clauses + flexibility2_clauses + leave_clauses
			gen female_clauses_b =  maternity_clauses_b + harassment_clauses_b + flexibility2_clauses_b + leave_clauses_b
			gen any_female_clauses_cl = (female_clauses >0 ) & female_clauses!=. 
			
			gen nonfemale_clauses = total_clauses - female_clauses
			gen nonfemale_clauses_b = total_clauses_b - female_clauses_b

			lab var female_clauses "Total female-specific clauses"
			lab var female_clauses_b "Total female-specific clauses (binary)"
			lab var any_female_cl "Has any female-specific clause"
			
		* Share of female specific 
			gen female_sh = female_clauses/ total_clauses
			label var female_sh "% female-specific amenities"

			gen female_sh_b = female_clauses_b/ total_clauses_b
			label var female_sh_b "% female-specific amenities (binary)"
					
		save "$files/contracts_act.dta", replace

		
		
*************************
*** Step 4 
*************************
	use "$files/unions/contracts_cnes.dta", clear
	keep contract_id employer_id union_id 
	merge m:1 contract_id using "$files/contracts_act.dta"
	keep if _merge == 3
	drop _merge
	* Tag establishment-years with more than one observation
	//from negotiating with multiple unions on same contract or from having multiple contracts in a year!
	gen n = 1
	bys employer_id year: egen numcont=sum(n)
	gen morethan1=(numcont>1)
	drop numcont
	*Take max clauses for each estab-union-year 
	//for 2015 if first CBA starts prior to the reform, the whole year is considered pre-period for the estab-union
	//for 2015 if first CBA starts after to the reform, the whole year is considered post-period for the estab-union
	collapse (max) cl_* morethan1 any_femrep sh_fem_rep ///
			 (min) post ///
			 (first) contract_id ///
			 , by(employer_id union_id year)
	tab year post
	rename contract_id cntrct_id
	
	* merge with treatment status of union
	merge m:1 union_id using  "$files/union_list_clean.dta", keepusing(CUT)
	drop if _m == 2
	drop _merge
	gen treat = CUT 
	tab treat 
	gen treatpost = treat * post
	mdesc treat post //will drop missing later
	
	* Tag establishments always negotiating with 1, same, union
	bysort employer_id union_id: gen temp = _n
	replace temp = 0 if temp>1 
	bysort employer_id: egen temp2 = total(temp)
	gen only1union = (temp2 == 1)
	drop temp temp2
	
	/* Categorize clauses */
		* Binary clause variables
		cap drop cl_0* 
		foreach xxx of varlist cl* {
			gen b`xxx' = (`xxx'>0)&(`xxx'<.)
		}
		
		/* Total number of clauses */
		egen total_clauses = rowtotal(cl*)
		lab var total_clauses "Total number of clauses"
		egen total_clauses_b = rowtotal(bcl*)
		lab var total_clauses_b "Total number of clauses (binary)"

		* Childcare assistance, maternity assistance, abortion protections, maternity protections, paternity protections, policy for dependents
			egen maternity_clauses =rowtotal(cl_23aux_cre cl_23aux_mat cl_43est_abo cl_43est_mae cl_43est_pai cl_43pol_par)
			lab var maternity_clauses "Total maternity clauses"
			
			egen maternity_clauses_b =rowtotal(bcl_23aux_cre bcl_23aux_mat bcl_43est_abo bcl_43est_mae bcl_43est_pai bcl_43pol_par)
			lab var maternity_clauses_b "Total maternity clauses (binary)"
			
			gen any_maternity_cl = (maternity_clauses>0) & maternity_clauses!=. 
			lab var any_maternity_cl "Has any maternity/childcare clause"
			
		* Discrimination and harassment related clauses
		* Sexual harassment, equal opportunities
		
			egen harassment_clauses=rowtotal(cl_42ass_sex cl_42igu_opo)
			lab var harassment_clauses "Total harassment clauses"
			
			egen harassment_clauses_b=rowtotal(bcl_42ass_sex bcl_42igu_opo)
			lab var harassment_clauses_b "Total harassment clauses (binary)"
			
			gen any_harassment_cl = (harassment_clauses>0) & harassment_clauses!=.
			lab var any_harassment_cl "Has any harassment clause"

		* Flexibility related clauses
		* Part-time contracts, work authorization on Sundays and holidays, workday controls, 
		* weekly rest, duration and schedule, absences, break intervals, special shifts, other provisions
		* of the workday, extension/reduction of the workday, on-call, uninterrupted shifts
			
			* workday controls + special shifts (mostly about breast-feeding) + on-call + uninterrupted shifts
			egen flexibility_clauses = rowtotal(cl_51con_jor cl_51jor_esp cl_51sob cl_51tur_ini)
			* + part-time
			egen flexibility2_clauses = rowtotal(cl_51con_jor cl_51jor_esp cl_51sob cl_51tur_ini cl_32con_tem)
			
			lab var flexibility_clauses "Total flexiblity clauses"
			lab var flexibility2_clauses "Total flexiblity clauses + part-time"
			
			* workday controls + special shifts (mostly about breast-feeding) + on-call + uninterrupted shifts
			egen flexibility_clauses_b = rowtotal(bcl_51con_jor bcl_51jor_esp bcl_51sob bcl_51tur_ini)
			* + part-time
			egen flexibility2_clauses_b = rowtotal(bcl_51con_jor bcl_51jor_esp bcl_51sob bcl_51tur_ini bcl_32con_tem)
			
			lab var flexibility_clauses_b "Total flexiblity clauses (binary)"
			lab var flexibility2_clauses_b "Total flexiblity clauses + part-time (binary)"
			
			gen any_flexibility_cl = (flexibility_clauses>0) & flexibility_clauses!=.
			lab var any_flexibility_cl "Has any flexibility clause"
			
			gen any_flexibility2_cl = (flexibility2_clauses>0) & flexibility2_clauses!=.
			lab var any_flexibility2_cl "Has any flexibility clause + part-time"
			
		* Leave related clauses
		* female workforce, abortion leave, adoption leave, maternity leave, unpaid leave, paid leave, other provisions on holidays and leaves
			egen leave_clauses =rowtotal(cl_33maoobr_fem cl_72lic_abo cl_72lic_ado cl_72lic_mat cl_72lic_nao cl_72lic_rem cl_73out_dis)
			lab var leave_clauses "Total leave clauses"

			egen leave_clauses_b =rowtotal(bcl_33maoobr_fem bcl_72lic_abo bcl_72lic_ado bcl_72lic_mat bcl_72lic_nao bcl_72lic_rem bcl_73out_dis)
			lab var leave_clauses_b "Total leave clauses (binary)"
			
			gen any_leave_cl = (leave_clauses>0) & leave_clauses!=. 
			lab var any_leave_cl "Has any leave clause"
			
		* Any female-specific amenity

			gen female_clauses =  maternity_clauses + harassment_clauses + flexibility2_clauses + leave_clauses
			gen female_clauses_b =  maternity_clauses_b + harassment_clauses_b + flexibility2_clauses_b + leave_clauses_b
			gen any_female_clauses_cl = (female_clauses >0 ) & female_clauses!=. 
			
			gen nonfemale_clauses = total_clauses - female_clauses
			gen nonfemale_clauses_b = total_clauses_b - female_clauses_b

			lab var female_clauses "Total female-specific clauses"
			lab var female_clauses_b "Total female-specific clauses (binary)"
			lab var any_female_cl "Has any female-specific clause"
			
		* Share of female specific 
			gen female_sh = female_clauses/ total_clauses
			label var female_sh "% female-specific amenities"

			gen female_sh_b = female_clauses_b/ total_clauses_b
			label var female_sh_b "% female-specific amenities (binary)"
				
		/* Shares of each type */
		
		global type maternity harassment flexibility flexibility2 leave

		foreach t of global type {
			gen `t'_sh = `t'_clauses/total_clauses
			label var `t'_sh "% `t' clauses"
		}
		
	* Attach some establishment characteristics from RAIS: industry, microregion
	merge m:1 employer_id using "$files/estabchars_wide.dta", keepusing(ind_mode microregion_mode state_mode has_2014) keep(1 3)
	drop _merge

	egen estab_union_pair = group(employer_id union_id)
	unique estab_union_pair 
	unique employer_id 

	* Check treatment is unique at estab-union level		
	bys estab_union_pair: egen maxtreat = max(treat)
	bys estab_union_pair: egen mintreat = min(treat)
	count if maxtreat!= mintreat
	drop maxtreat mintreat 
	
	* Drops
	gen todrop = (treat==.)|(ind_mode==.)|(microregion_mode==.)|(has_2014==.)|(female_sh==.) 
	mdesc treat *_mode has_2014 female_sh
	tab todrop 
	drop if todrop==1
	drop todrop*
	unique estab_union_pair 
	unique employer_id 
	unique union_id 
	
	* 2digit industry
	cap drop temp 
	tostring ind_mode, gen(temp)
	gen lenind = length(temp) 
	replace temp = "0"* (5-lenind) + temp
	drop lenind
	gen lenind = length(temp) 
	tab lenind
	gen ind2d = substr(temp, 1, 2)
	tab ind2d // 59 values 
	drop temp lenind
	encode ind2d, gen(ind2d_num)
	
	* First year when CBA was negotiated (2009-2017)
	egen xxx = min(year), by(employer_id union_id)
	egen yyy = tag(employer_id union_id)
	tab xxx if yyy==1
	gen min_cbayear = xxx
	drop xxx yyy
	
	*Save 
	save "$files/estab_union_act.dta", replace
	
	

*************************
* Step 5: create full, balanced panel at estab-union-year level
************************
// Assumption: female clauses from previous contract carry over to next one (this is actually true after 2012)
// Before the first time an estab-union pair is observed, clauses are set to 0
		
* Start from all combinations of contract-employer-union
use "$files/unions/contracts_cnes.dta", clear
keep contract_id employer_id union_id 
* Keep only contracts in our sample 
merge m:1 contract_id using "$files/contracts_act.dta"
keep if _merge == 3
drop _merge
keep contract_id employer_id union_id end_mdy start_mdy cl_* year any_femrep sh_fem_rep
rename year start_year 
gen end_year = year(end_mdy)	
* Ensure pairs in filled panel are those in the new contracts sample
preserve	
	use "$files/estab_union_act.dta", clear
	keep employer_id union_id
	duplicates drop	
	tempfile temp
	save `temp'
restore
merge m:1 employer_id union_id using `temp'
keep if _merge==3
drop _merge

* Drop contracts expiring before June 2011 : we don't care, bc we want to start filling in the panel from 2011
	drop if end_mdy<date("06/30/2011", "MDY")
* Tag contracts expiring after 2012 reforms (their amenities count if not re-negotiated)
	gen reform2012 = date("09/25/2012", "MDY")
	gen expires_after_reform = (end_mdy > reform2012) if end_mdy!=. 
	
	
* get "effective" year --> if covers 6 or more months in that year
	forvalues year = 2011/2017 {
			gen len_`year' = date("12/31/`year'", "MDY") - start_mdy
			replace len_`year' = 0 if len_`year'<0 // replace to 0 if starts in following year 
			replace len_`year' = 0 if end_year<`year' // replace to 0 if ends in previous year
		}
	forvalues year = 2011/2017 {
			gen counts_for_`year' = (len_`year'>182)
		}	
	* for 2017 replace count to 1
	replace counts_for_2017 = 1 if start_year == 2017
	
	* some checks
	egen total_years = rowtotal(counts_for*)
	tab start_year if total_years == 0  
	
* Determine which clauses to drag forward for 2011-2012: collapse max clauses of contracts that expire after reform 
forvalues y = 2011/2012 {
    preserve
		keep if expires_after_reform==1 
		keep if counts_for_`y' == 1
		drop counts_for_`y' 
		collapse (max) cl_* any_femrep sh_fem_rep (first) contract_id, by(employer_id union_id)
		gen year =`y'
		rename cl_* cl_*_TDF
		rename any_femrep any_femrep_TDF
		rename sh_fem_rep sh_fem_rep_TDF
		rename contract_id cntrct_id
		tempfile tempA`y'
		save `tempA`y''
	restore
}	

forvalues y = 2011/2017 {
    preserve
		keep if counts_for_`y' == 1
		drop counts_for_`y'
		collapse (max) cl_* any_femrep sh_fem_rep (first) contract_id, by(employer_id union_id)
		gen year =`y'
		rename contract_id cntrct_id
		tempfile temp`y'
		save `temp`y''
	restore
}	
	
	use `temp2011', clear
	merge 1:1 employer_id union_id year using `tempA2011', nogen
	append using `temp2012'
	merge 1:1 employer_id union_id year using `tempA2012', nogen 
	append using `temp2013'
	append using `temp2014'
	append using `temp2015'
	append using `temp2016'
	append using `temp2017'
	
	isid employer_id union_id year
	save "$files/temp_filledpanel.dta", replace 
	
	use "$files/temp_filledpanel.dta", clear
	* Now for each estab-union-year we have 2 measures of clauses (only for 2011 and 2012)
	*	one that is the max over contracts in that year
	*	one that is the max over contracts in that year that expired after the 2012 reform 
	
	* Fill in the panel:
		* first, replace clauses to drag = clauses for years>= 2013
		unab allcl: cl_* any_femrep* sh_fem_rep*
		unab toexclude: *_TDF
		local toreplace: list allcl - toexclude
		
		foreach var of local toreplace {
			replace `var'_TDF = `var' if year>=2013 
		}
		* Then fill in the panel in following way:		
		egen estab_union_pair = group(employer_id union_id)
		xtset estab_union_pair year 
		tsfill , full
		* Start filling in panel with clauses to carry forward 
		unab allcl: cl_* any_femrep* sh_fem_rep*
		unab toexclude: *_TDF
		local toreplace: list allcl - toexclude
		
			foreach var of local toreplace {
					gen `var'_panel = `var'_TDF
					replace `var'_panel = l.`var'_TDF if `var'_panel ==. 	
					replace `var'_panel = l2.`var'_TDF if `var'_panel ==. 
					replace `var'_panel = l3.`var'_TDF if `var'_panel ==. 
					replace `var'_panel = l4.`var'_TDF if `var'_panel ==. 
					replace `var'_panel = l5.`var'_TDF if `var'_panel ==. 
					replace `var'_panel = l6.`var'_TDF if `var'_panel ==. 

					} 
		* In 2011 and 2012, replace with actual count (not just those expiring after reform)
		unab allcl: cl_* any_femrep* sh_fem_rep*
		unab toexclude: *_TDF *_panel 
		local toreplace: list allcl - toexclude
		
			foreach var of local toreplace {
				replace `var'_panel = `var' if `var'!=. & inrange(year, 2011, 2012)
			}
		* Replace missing to zeros
		unab allcl: cl_* 
		unab toexclude: *_TDF *_panel 
		local toreplace: list allcl - toexclude
		
			foreach var of local toreplace {
				replace `var'_panel = 0 if `var'_panel==. 
			}

		keep employer_id union_id  estab_union_pair year cntrct_id *panel
		rename *_panel * 
		
	* Fill in estab and union ids
	sort estab_union_pair year
	replace employer_id=employer_id[_n-1] if employer_id=="" & estab_union_pair == estab_union_pair[_n-1]
	replace employer_id=employer_id[_n+1] if employer_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace employer_id=employer_id[_n+1] if employer_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace employer_id=employer_id[_n+1] if employer_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace employer_id=employer_id[_n+1] if employer_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace employer_id=employer_id[_n+1] if employer_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace employer_id=employer_id[_n+1] if employer_id=="" & estab_union_pair == estab_union_pair[_n+1]
	count if employer_id==""

	replace union_id=union_id[_n-1] if union_id=="" & estab_union_pair == estab_union_pair[_n-1]
	replace union_id=union_id[_n+1] if union_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace union_id=union_id[_n+1] if union_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace union_id=union_id[_n+1] if union_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace union_id=union_id[_n+1] if union_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace union_id=union_id[_n+1] if union_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace union_id=union_id[_n+1] if union_id=="" & estab_union_pair == estab_union_pair[_n+1]
	count if union_id ==""
	
	replace cntrct_id=cntrct_id[_n-1] if cntrct_id=="" & estab_union_pair == estab_union_pair[_n-1]
	replace cntrct_id=cntrct_id[_n+1] if cntrct_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace cntrct_id=cntrct_id[_n+1] if cntrct_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace cntrct_id=cntrct_id[_n+1] if cntrct_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace cntrct_id=cntrct_id[_n+1] if cntrct_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace cntrct_id=cntrct_id[_n+1] if cntrct_id=="" & estab_union_pair == estab_union_pair[_n+1]
	replace cntrct_id=cntrct_id[_n+1] if cntrct_id=="" & estab_union_pair == estab_union_pair[_n+1]
	count if cntrct_id ==""
	
		
	/* Categorize clauses */
		* Binary clause variables
		cap drop cl_0* 
		foreach xxx of varlist cl* {
			gen b`xxx' = (`xxx'>0)&(`xxx'<.)
		}
		
		/* Total number of clauses */
		egen total_clauses = rowtotal(cl*)
		lab var total_clauses "Total number of clauses"
		egen total_clauses_b = rowtotal(bcl*)
		lab var total_clauses_b "Total number of clauses (binary)"

		* Childcare assistance, maternity assistance, abortion protections, maternity protections, paternity protections, policy for dependents
			egen maternity_clauses =rowtotal(cl_23aux_cre cl_23aux_mat cl_43est_abo cl_43est_mae cl_43est_pai cl_43pol_par)
			lab var maternity_clauses "Total maternity clauses"
			
			egen maternity_clauses_b =rowtotal(bcl_23aux_cre bcl_23aux_mat bcl_43est_abo bcl_43est_mae bcl_43est_pai bcl_43pol_par)
			lab var maternity_clauses_b "Total maternity clauses (binary)"
			
			gen any_maternity_cl = (maternity_clauses>0) & maternity_clauses!=. 
			lab var any_maternity_cl "Has any maternity/childcare clause"
			
		* Discrimination and harassment related clauses
		* Sexual harassment, equal opportunities
		
			egen harassment_clauses=rowtotal(cl_42ass_sex cl_42igu_opo)
			lab var harassment_clauses "Total harassment clauses"
			
			egen harassment_clauses_b=rowtotal(bcl_42ass_sex bcl_42igu_opo)
			lab var harassment_clauses_b "Total harassment clauses (binary)"
			
			gen any_harassment_cl = (harassment_clauses>0) & harassment_clauses!=.
			lab var any_harassment_cl "Has any harassment clause"

		* Flexibility related clauses
		* Part-time contracts, work authorization on Sundays and holidays, workday controls, 
		* weekly rest, duration and schedule, absences, break intervals, special shifts, other provisions
		* of the workday, extension/reduction of the workday, on-call, uninterrupted shifts
			
			* workday controls + special shifts (mostly about breast-feeding) + on-call + uninterrupted shifts
			egen flexibility_clauses = rowtotal(cl_51con_jor cl_51jor_esp cl_51sob cl_51tur_ini)
			* + part-time
			egen flexibility2_clauses = rowtotal(cl_51con_jor cl_51jor_esp cl_51sob cl_51tur_ini cl_32con_tem)
			
			lab var flexibility_clauses "Total flexiblity clauses"
			lab var flexibility2_clauses "Total flexiblity clauses + part-time"
			
			* workday controls + special shifts (mostly about breast-feeding) + on-call + uninterrupted shifts
			egen flexibility_clauses_b = rowtotal(bcl_51con_jor bcl_51jor_esp bcl_51sob bcl_51tur_ini)
			* + part-time
			egen flexibility2_clauses_b = rowtotal(bcl_51con_jor bcl_51jor_esp bcl_51sob bcl_51tur_ini bcl_32con_tem)
			
			lab var flexibility_clauses_b "Total flexiblity clauses (binary)"
			lab var flexibility2_clauses_b "Total flexiblity clauses + part-time (binary)"
			
			gen any_flexibility_cl = (flexibility_clauses>0) & flexibility_clauses!=.
			lab var any_flexibility_cl "Has any flexibility clause"
			
			gen any_flexibility2_cl = (flexibility2_clauses>0) & flexibility2_clauses!=.
			lab var any_flexibility2_cl "Has any flexibility clause + part-time"
			
		* Leave related clauses
		* female workforce, abortion leave, adoption leave, maternity leave, unpaid leave, paid leave, other provisions on holidays and leaves
			egen leave_clauses =rowtotal(cl_33maoobr_fem cl_72lic_abo cl_72lic_ado cl_72lic_mat cl_72lic_nao cl_72lic_rem cl_73out_dis)
			lab var leave_clauses "Total leave clauses"

			egen leave_clauses_b =rowtotal(bcl_33maoobr_fem bcl_72lic_abo bcl_72lic_ado bcl_72lic_mat bcl_72lic_nao bcl_72lic_rem bcl_73out_dis)
			lab var leave_clauses_b "Total leave clauses (binary)"
			
			gen any_leave_cl = (leave_clauses>0) & leave_clauses!=. 
			lab var any_leave_cl "Has any leave clause"
			
		* Any female-specific amenity

			gen female_clauses =  maternity_clauses + harassment_clauses + flexibility2_clauses + leave_clauses
			gen female_clauses_b =  maternity_clauses_b + harassment_clauses_b + flexibility2_clauses_b + leave_clauses_b
			gen any_female_clauses_cl = (female_clauses >0 ) & female_clauses!=. 
			
			gen nonfemale_clauses = total_clauses - female_clauses
			gen nonfemale_clauses_b = total_clauses_b - female_clauses_b

			lab var female_clauses "Total female-specific clauses"
			lab var female_clauses_b "Total female-specific clauses (binary)"
			lab var any_female_cl "Has any female-specific clause"
			
		* Share of female specific 
			gen female_sh = female_clauses/ total_clauses
			label var female_sh "% female-specific amenities"

			gen female_sh_b = female_clauses_b/ total_clauses_b
			label var female_sh_b "% female-specific amenities (binary)"
						
		* Shares of each type 
		
		global type maternity harassment flexibility flexibility2 leave

		foreach t of global type {
			gen `t'_sh = `t'_clauses/total_clauses
			replace `t'_sh = 0 if `t'_sh ==. 
			label var `t'_sh "% `t' clauses"
		}
		
		
		global type maternity harassment flexibility flexibility2 leave

		foreach t of global type {
			gen `t'_sh_b = `t'_clauses_b/total_clauses_b
			replace `t'_sh_b = 0 if `t'_sh_b ==. 
			label var `t'_sh_b "% `t' clauses (binary)"
		}
		

	* merge with treatment status of the union (same as new contracts)
	//pair and treat
	drop estab_union_pair
	preserve
		use  "$files/estab_union_act.dta", clear
		keep employer_id union_id treat estab_union_pair
		duplicates drop
		tempfile temp
		save `temp'
	restore
	merge m:1 employer_id union_id using `temp'
	drop if _m==2
	drop _merge
	//post status in 2015 from new contracts
	preserve
		use  "$files/estab_union_act.dta", clear
		keep if year==2015
		keep estab_union_pair post
		rename post post2015_new
		duplicates drop
		tempfile temp
		save `temp'
	restore
	merge m:1 estab_union_pair using `temp'
	drop if _m==2
	drop _merge
	
	gen post = (year>=2015)
	gen treatpost = treat * post 

	save "$files/filledpanel.dta", replace 
	erase "$files/temp_filledpanel.dta"
	
	* merge with estab chars
	use "$files/estab_union_act.dta", clear
	keep estab_union_pair only1union ind_mode microregion_mode state_mode min_cbayear
	duplicates drop
	tempfile oneu
	save `oneu'
	
	* merge with estab chars tag 
	use "$files/filledpanel.dta", clear
	merge m:1 estab_union_pair using `oneu' 
	drop if _m==2
	drop _merge 
	
	* 2digit industry
	cap drop temp 
	tostring ind_mode, gen(temp)
	gen lenind = length(temp) 
	replace temp = "0"* (5-lenind) + temp
	drop lenind
	gen lenind = length(temp) 
	tab lenind
	gen ind2d = substr(temp, 1, 2)
	tab ind2d // 59 values 
	drop temp lenind
	encode ind2d, gen(ind2d_num)
	
	xtset estab_union_pair year
	save "$files/filledpanel.dta", replace 


cap log close		
